Load packages

Load Tidied Data

Visualize the data

Data QA/QC plots

ggplot(all, aes(x=date, y=diff_n_cont, color=genet)) +
  geom_point() +
  facet_wrap(~treatment) +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal()

Plot basic growth rate across entire experiment

ggplot(all, aes(x=date, y=growth_rate_cont, color=genet)) +
  facet_wrap(~treatment,
             labeller =  labeller(treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  geom_point(alpha=0.5) +
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  stat_poly_eq(use_label("eq"), formula = y ~ x, na.rm=TRUE) +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal() +
  labs(x = "Date",
       y = "Cumulative population growth (no. polyps)",
       color = "Genet")

#ggsave(here("experiment", "figures", "growth_genet.png"), width=12, height=7)

Plot growth rate of all genets during and after MHW

ggplot(all, aes(x=date, y=growth_rate, color = treatment)) +
  geom_point(alpha=0.1) +
  facet_wrap(~mhw, scales="free", labeller = labeller(mhw = c("during" = "During MHW", "after" = "After MHW"))) +
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  stat_poly_eq(use_label("eq"), formula = y ~ x, parse = TRUE, label.x.npc = "left") +
  scale_color_manual(values = c("cold" = "#0072B2", "severe" = "#E69F00", "extreme" = "#D55E00"),
                     labels = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW")) +
  theme_minimal() +
  coord_cartesian(ylim = c(-5, 25), expand=TRUE) +   #set equal y axes coordinates
  labs(x = "Date",
       y = "Cumulative population growth (no. polyps)",
       color = "Treatment")

#ggsave(here("experiment", "figures", "growth_MHW.png"), width=12, height=7)

Plot growth rate of each genet during and after MHW

ggplot(all, aes(x=date, y=growth_rate, color = genet)) +
  geom_point(alpha=0.1) +
  facet_grid(mhw~treatment, 
             #switch = 'x',
             scales = "free_x",
             labeller = labeller(mhw = c("during" = "During MHW", "after" = "After MHW"),
                                 treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  stat_poly_eq(use_label("eq"), formula = y ~ x, parse = TRUE, label.x.npc = "left") +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal() +
  #coord_cartesian(ylim = c(-5, 25), expand=TRUE) +   #set equal y axes coordinates
  labs(x = "Date",
       y = "Cumulative population growth (no. polyps)",
       color = "Genet")

  theme(strip.placement = "outside")
## List of 1
##  $ strip.placement: chr "outside"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
#ggsave(here("experiment", "figures", "growth_MHW_genet.png"), width=12, height=10)

Plot growth rate of each genet during and after MHW - but not faceted by MHW

label_plot <- c("0.13", "0.09", "0.071", "0.0522", "0.0761",
                "0.151", "0.0933", "0.0252", "0.116", "0.0413", #extreme during
                "0.166", "0.142", "0.0522", "0.134", "0.0902",
                "-0.0023", "0.00741", "0.026", "0.00843", "-0.0131",
                "0.0102", "0.00652", "0.00881", "0.00421", "-0.0117", #extreme after
                "-0.0104", "-0.000895", "0.0217", "0.0127", "-0.0131")
color_plot <- rep(c("#9370DB", "#C21B78", "#FF9933", "#FF3333", "#662B45"), times = 6)
treatment_plot <- rep(c(rep("cold", 5), rep("extreme", 5), rep("severe", 5)), times = 2)
date_plot <- c(rep("2023-10-01", 15), rep("2024-01-10", 15))

p <- ggplot(all, aes(x=date, y=growth_rate_cont, color=genet, shape=mhw)) +
  geom_point(alpha=0.15) +
  facet_wrap(~factor(treatment, levels=c("cold", "severe", "extreme"), labels=c("Ambient", "Severe MHW", "Extreme MHW"))) +   geom_rect(aes(xmin = as.POSIXct("2023-11-30"), xmax = as.POSIXct("2023-12-10"), ymin = -5, ymax = -4.5), color = "dimgray", fill = "dimgray", alpha = 0.4) +  # Add horizontal bar
    geom_rect(aes(xmin = as.POSIXct("2024-02-14"), xmax = as.POSIXct("2024-02-21"), ymin = -5, ymax = -4.5), color = "dimgray", fill = "dimgray", alpha = 0.4) +  # Add horizontal bar
  geom_vline(xintercept = as.POSIXct("2023-12-10"), linetype = "dashed") +  # Add vertical line delineating during/after MHW
  geom_smooth(method = "lm", se = FALSE) +
  #stat_poly_eq(use_label("eq"), formula = y ~ x, parse = TRUE) +  #un-comment this to get updated values to add to label_plot
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  scale_shape_manual(values = c("during" = 16, "after" = 17)) +  # Define shapes for mhw levels
  theme_minimal() +
  coord_cartesian(ylim = c(-5, 25), expand=TRUE) +
  labs(x = "Date",
       y = "Cumulative population growth (no. polyps)",
       color = "Genet") +
  guides(shape = "none")

loop_count = 0
for (i in 1:length(label_plot)) {
  p <- p + geom_text(data = data.frame(date = as.Date("2023-12-30"), treatment = treatment_plot[i], mhw = "during"), 
              label=paste0("m = ", label_plot[i]), x = as.Date(date_plot[i]), y = 25-loop_count, hjust = 0, vjust = 1, color=color_plot[i])
  loop_count <- loop_count + 1
  if (loop_count > 4) {
    loop_count <- 0  # Reset counter after every 5th iteration
  }
}
p

#ggsave(here("experiment", "figures", "growth_MHW_genet_combined.png"), width=12, height=7)

Plot raw population numbers

g <- ggplot(all, aes(x=date, y=n_true, color=genet, shape=mhw)) +
  geom_point(alpha=0.15) +
  facet_wrap(~factor(treatment, levels=c("cold", "severe", "extreme"), labels=c("Ambient", "Severe MHW", "Extreme MHW"))) +
  geom_rect(aes(xmin = as.POSIXct("2023-11-30"), xmax = as.POSIXct("2023-12-10"), ymin = 5, ymax = 5.5), color = "dimgray", fill = "dimgray", alpha = 0.2) +  # Add horizontal bar
  geom_rect(aes(xmin = as.POSIXct("2024-02-14"), xmax = as.POSIXct("2024-02-21"), ymin = 5, ymax = 5.5), color = "dimgray", fill = "dimgray", alpha = 0.2) +  # Add horizontal bar
  geom_vline(xintercept = as.POSIXct("2023-12-10"), linetype = "dashed", linewidth = 0.5) +  # Add vertical line delineating during/after MHW
  geom_smooth(method = "lm", se = FALSE) +
 # stat_poly_eq(use_label("eq"), formula = y ~ x, parse = TRUE) +  
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  scale_shape_manual(values = c("during" = 16, "after" = 17)) +  # Define shapes for mhw levels
  theme_minimal() +
  #coord_cartesian(ylim = c(5, 40), expand=TRUE) +
  labs(x = "Date",
       y = "Number of polyps",
       color = "Genet") +
  guides(shape = "none")

loop_count = 0
for (i in 1:length(label_plot)) {
  g <- g + geom_text(data = data.frame(date = as.Date("2023-12-30"), treatment = treatment_plot[i], mhw = "during"), 
              label=paste0("m = ", label_plot[i]), x = as.Date(date_plot[i]), y = 40-loop_count, hjust = 0, vjust = 1, color=color_plot[i])
  loop_count <- loop_count + 1
  if (loop_count > 4) {
    loop_count <- 0  # Reset counter after every 5th iteration
  }
}
g

#ggsave(here("experiment", "figures", "n_MHW_genet_combined.png"), width=12, height=7)

Plot behavior data

# %closed facet grid
ggplot(all, aes(x=genet, y=percent_closed, fill=genet, group=genet)) +
  geom_point(aes(color=genet)) +
  geom_violin(position="dodge", alpha=0.5, outlier.colour="transparent") +
  facet_grid(mhw~treatment)

# %closed facet wrap
ggplot(all, aes(x = genet, y = percent_closed, fill = genet, group = interaction(genet, mhw), shape = mhw)) +
  geom_point(aes(color = genet), position = position_dodge(width = 0.9)) +
  geom_violin(aes(size = mhw), position = position_dodge(width = 0.9), alpha = 0.5, outlier.colour = "transparent") +
  facet_wrap(~treatment) +
  scale_size_manual(values = c("during" = 1, "after" = 0.5)) + # Define sizes for mhw levels
  scale_shape_manual(values = c("during" = 16, "after" = 17))  # Define shapes for mhw levels

# %fullyOpen facet wrap
ggplot(all, aes(x = genet, y = percent_fully_open, fill = genet, group = interaction(genet, mhw), shape = mhw)) +
  geom_point(aes(color = genet), position = position_dodge(width = 0.9), alpha = 0.5) +
  geom_violin(aes(size = mhw), position = position_dodge(width = 0.9), alpha = 0.3, color = alpha("black", 0.75)) +
  facet_wrap(~treatment,
             labeller =  labeller(treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  scale_size_manual(values = c("during" = 1, "after" = 0.5), labels = c("During MHW", "After MHW")) + # Define sizes for mhw levels
  scale_shape_manual(values = c("during" = 16, "after" = 17), labels = c("During MHW", "After MHW")) + # Define shapes for mhw levels
  # change color of violin 
  scale_fill_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal() +
  labs(x = "Genet",
       y = "% open polyps",
       color = "Genet",
       shape = "MHW",
       size = "MHW") +
  guides(fill = "none",
         color = "none")

#ggsave(here("experiment", "figures", "open_MHW_genet.png"), width=12, height=7)

Plot total biomass over time

ggplot(all, aes(x=date, y=total_biomass)) +
  geom_point(aes(color=genet), size=0.5) +
  geom_smooth(aes(fill=genet, color=genet), method="lm", se=T) +
  facet_wrap(~treatment,
             labeller =  labeller(treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  scale_fill_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal() +
  labs(x = "Date",
       y = "Total biomass (g)",
       color = "Genet",
       fill = "Genet")

#ggsave(here("experiment", "figures", "total_biomass_mhw.png"), width=12, height=7)

Plot average body size over time

ggplot(all, aes(x=date, y=avg_size)) +
  geom_point(aes(color=genet), size=0.5) +
  geom_smooth(aes(fill=genet, color=genet), method="loess", se=T) +
  facet_wrap(~treatment,
             labeller =  labeller(treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  scale_fill_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  scale_color_manual(values = c("A" = "#9370DB", "B" = "#C21B78", "C" = "#FF9933", "D" ="#FF3333", "E" = "#662B45")) +
  theme_minimal() +
  labs(x = "Date",
       y = "Average body size (mm)",
       color = "Genet",
       fill = "Genet")

#ggsave(here("experiment", "figures", "avg_size_mhw.png"), width=12, height=7)

Plot mortality (lol)

ggplot(all, aes(x=date, y=dying_dead, color = treatment)) +
  geom_point(alpha=0.1) +
  facet_wrap(~treatment, 
             labeller = labeller(treatment = c("cold" = "Ambient", "severe" = "Severe MHW", "extreme" = "Extreme MHW"))) +
  geom_smooth(method = "lm", formula = y ~ x, se = F) +
  scale_color_manual(values = c("cold" = "#0072B2", "severe" = "#E69F00", "extreme" = "#D55E00")) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10), labels = function(x) format(x, nsmall = 0)) +
  labs(x = "Date",
       y = "Mortality (no. polyps)",
       color = "Treatment") +
  guides(color = "none")

#ggsave(here("experiment", "figures", "mortality_MHW.png"), width=12, height=7)

Start running stats

Count data - GLARMA

Body size data - Autoregression

Log transform data

all_size <- all %>%
  filter(!is.na(avg_size)) %>%
  mutate(avg_size_log = log(avg_size))

# scatter plot of transformed data
ggplot(all_size, aes(x=date, y=avg_size_log)) +
  geom_point(aes(color=treatment)) +
  labs(x = "Average body size (mm)",
       y = "Frequency")

# histogram of transformed data
ggplot(all_size, aes(x=avg_size_log)) +
  geom_histogram(binwidth=0.1) +
  labs(x = "log(Average body size (mm))",
       y = "Frequency")

# transformed data density plot
ggplot(all_size, aes(x = avg_size_log)) +
  geom_density() +
  ggtitle("Density Plot of Transformed Data")

# Q-Q plot of log-transformed data
qqnorm(all_size$avg_size_log)
qqline(all_size$avg_size_log, col = "red")

It’s not perfectly normal but definitely better log-transformed. Looks a bit like exponential decay, so let’s make the data stationary by differencing.

Difference to make the data stationary

all_size <- all_size %>%
  group_by(tank, genet) %>%
  mutate(avg_size_diff = avg_size_log - lag(avg_size_log),
         avg_size_diff2 = avg_size_diff - lag(avg_size_diff)) #need second order differencing for slope=0

ggplot(all_size, aes(x=date)) +
  geom_point(aes(y=avg_size_diff), color="black", alpha=0.3) +
  geom_point(aes(y=avg_size_diff2), color="blue", alpha=0.3) +
  geom_smooth(aes(y=avg_size_diff), method="lm", color="black") +
  geom_smooth(aes(y=avg_size_diff2), method="lm", color="blue") +
  labs(x = "Average body size (mm)",
       y = "Frequency")

Plot a slightly more complex vis of the first differenced data

# Fit linear models and extract coefficients
models <- all_size %>%
  group_by(treatment, genet, mhw) %>%
  do(tidy(lm(avg_size_diff ~ date, data = .))) %>%
  select(treatment, genet, term, estimate) %>%
  pivot_wider(names_from = term, values_from = estimate) %>%
  mutate(equation = sprintf("y = %.2f + %.2e * x", `(Intercept)`, date)) %>%
  rename(slope = date) %>%
  ungroup() %>%
  group_by(mhw) %>%
  mutate(x_axis = as.POSIXct(ifelse(mhw == "during", "2023-10-15", "2024-01-15")),
         y_axis = rep(seq(from = 0.2, to = 0.5, length.out = 5), length.out = n())) %>%
  ungroup() %>%
  select(-mhw)

# Join regression equations with the original data
all_size_reg <- all_size %>%
  left_join(models, by = c("treatment", "genet"), relationship = "many-to-many")

ggplot(all_size_reg) +
  geom_point(aes(x=date, y=avg_size_diff, color=genet), alpha=0.3) +
  geom_smooth(aes(x=date, y=avg_size_diff, color=genet ,linetype = mhw), method="lm") +
  facet_wrap(~treatment) +
  geom_label_repel(data = models, aes(x=x_axis, y=y_axis, label = equation, color = genet), size = 3) +
  labs(x = "Date",
       y = "Differenced average body size (mm)") +
  theme_bw()

#ggsave(here("experiment", "figures", "body_size_regression.png"), width=15, height=7)

Run ADF and KPSS tests on first and second differenced data

# List to store results
adf_results <- list()

# Loop through each group
for (group in unique(all_size$treatment)) {
  group_data <- all_size %>%
    filter(treatment == group,
           !is.na(avg_size_diff2)) %>%
    pull(avg_size_diff2) #chnage this to avg_size_diff for first differenced data
  # Perform the ADF test
  adf_test <- adf.test(group_data, alternative = "stationary")
  # Store the result
  adf_results[[group]] <- adf_test
}

# Print results
adf_results
## $cold
## 
##  Augmented Dickey-Fuller Test
## 
## data:  group_data
## Dickey-Fuller = -7.5935, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $severe
## 
##  Augmented Dickey-Fuller Test
## 
## data:  group_data
## Dickey-Fuller = -5.3592, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## 
## 
## $extreme
## 
##  Augmented Dickey-Fuller Test
## 
## data:  group_data
## Dickey-Fuller = -4.415, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# List to store results
kpss_results <- list()
# Loop through each group
for (group in unique(all_size$treatment)) {
  group_data <- all_size %>%
    filter(treatment == group,
           !is.na(avg_size_diff2)) %>%
    pull(avg_size_diff2) #chnage this to avg_size_diff for first differenced data
  # Perform the KPSS test
  kpss_test <- ur.kpss(group_data, type = "mu")  # Use "tau" if you expect trend stationarity
  # Store the result
  kpss_results[[group]] <- summary(kpss_test)
}

# Print results
kpss_results
## $cold
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.03 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## $severe
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0316 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
## 
## 
## $extreme
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0844 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

Both p-value is less than 0.01 and test statistic is lower than critical values, so second-order differenced data is now almost certainly stationary. First-order differencing only showed strong evidence for stationarity with the ADF test.

Plot ACF and PACF for the log-transformed body size data

Acf(all_size$avg_size_log, main = "ACF of Log-transformed Series")

Pacf(all_size$avg_size_log, main = "PACF of Log-transformed Series")

Acf(all_size$avg_size_diff, main = "ACF of Differenced Series")

Pacf(all_size$avg_size_diff, main = "PACF of Differenced Series")

Acf(all_size$avg_size_diff2, main = "ACF of 2-Differenced Series")

Pacf(all_size$avg_size_diff2, main = "PACF of 2-Differenced Series")

There is definitely a lot of autocorrelation with the data, for both 1-diff and 2-diff. Makes sense because body size should be highly correlated over time.

Remove NA values and convert factors to numeric

all_size_na <- all_size %>%
  filter(!is.na(avg_size_diff)) %>%
  mutate(genet = as.numeric(genet),
         treatment = as.numeric(treatment),
         mhw = as.numeric(mhw)) %>%
  select(date, tank, genet, n, treatment, mhw, avg_size, avg_size_log, avg_size_diff, avg_size_diff2)

First-differenced data

Auto ARIMA - basic

autoArima <- auto.arima(all_size_na$avg_size_diff, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(autoArima)
## Series: all_size_na$avg_size_diff 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.7682  -1.5296  0.5354
## s.e.  0.0518   0.0662  0.0646
## 
## sigma^2 = 0.002058:  log likelihood = 2258.51
## AIC=-4509.02   AICc=-4508.99   BIC=-4488.19
## 
## Training set error measures:
##                       ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.001281871 0.04529954 0.03479757 Inf  Inf 0.7851294 -0.009148644

I ran this all again with second-differenced data, and the model doesn’t appear significantly better (in some cases like BIC, not better) and so it doesn’t justify the added complexity of a second-differenced model. so let’s stick with first-differenced model.

Check model residuals

residuals_auto <- residuals(autoArima)
plot(residuals_auto, main = "Residuals of AUTOSARIMA Model")

Acf(residuals_auto, main = "ACF of Residuals")

Box.test(residuals_auto, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals_auto
## X-squared = 0.11324, df = 1, p-value = 0.7365
adf_test_residuals <- adf.test(residuals_auto, alternative = "stationary") #definitely stationary
qqnorm(residuals_auto)
qqline(residuals_auto)

shapiro.test(residuals_auto)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals_auto
## W = 0.99054, p-value = 1.217e-07
ggplot() +
  geom_histogram(aes(x = residuals_auto), bins = 20, color = "black") +
  labs(x = "Residuals",
       y = "Frequency")

fitted_values <- fitted(autoArima)
plot(fitted_values, residuals_auto, main="Residuals vs Fitted Values")
abline(h=0, col="red")

2-order differencing is not significantly better than 1-order differencing so let’s stick with 1-order.

AR(1) model: This is autoregressive data, and the current value of the time series depends linearly on its immediately preceding value - makes sense because body size shouldn’t change that drastically week by week. I(1): First-order differencing applied to make the series stationary - which makes sense because preliminary analyses showed slope of zero with second-differenced data. MA(2): current value of the time series depends on the last two periods’ forecast errors - means that the current value of the series depends on the errors made in the previous two periods, which doesn’t make a ton of intuitive sense except maybe if there are big changes to body size (i.e. reproduction?) that just means we won’t see those changes finish until 2 weeks later? <<< CHECK THIS LOGIC.

Check ts trends/seasonality

all_size_ts <- ts(all_size_na$avg_size_diff, frequency = 52) #weekly data as ts object
class(all_size_ts)
## [1] "ts"
summary(all_size_ts)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.325110 -0.060547 -0.027665 -0.030876  0.002888  0.168555
acf(all_size_ts)

pacf(all_size_ts)

decomposition <- stl(all_size_ts, s.window = "periodic")
plot(decomposition)

There’s no apparent seasonality and a slight positive trend in the data

Fit ARIMA with different I values

# Fit ARIMA with different seasonal orders
model1 <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 1))
model2 <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 2))
model3 <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 3))

Acf(residuals(model1), main = paste("ACF of Residuals model1"))

Pacf(residuals(model1), main = paste("PACF of Residuals model1"))

Acf(residuals(model2), main = paste("ACF of Residuals model2"))

Pacf(residuals(model2), main = paste("PACF of Residuals model2"))

Acf(residuals(model3), main = paste("ACF of Residuals model3"))

Pacf(residuals(model3), main = paste("PACF of Residuals model3"))

Box.test(residuals(model1), lag = 20, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model1)
## X-squared = 48.604, df = 20, p-value = 0.0003498
Box.test(residuals(model2), lag = 20, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model2)
## X-squared = 30.952, df = 20, p-value = 0.05582
Box.test(residuals(model3), lag = 20, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model3)
## X-squared = 30.904, df = 20, p-value = 0.05647

I(2) is definitely the best fit.

Consider potential covariates

#create matrix of dummy variables incorporating all potential covariates
xreg_All <- model.matrix(~ treatment + genet + mhw - 1, data = all_size_na)
xreg_NoGenet <- model.matrix(~ treatment + mhw - 1, data = all_size_na)
xreg_MHW <- model.matrix(~ mhw - 1, data = all_size_na)
xreg_Treatment <- model.matrix(~ treatment - 1, data = all_size_na)

arima_covariates_all <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 2), xreg = xreg_All)
summary(arima_covariates_all)
## Series: all_size_na$avg_size_diff 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##          ar1      ma1     ma2  treatment   genet     mhw
##       0.7657  -1.5291  0.5349    -0.0029  -3e-04  0.0011
## s.e.  0.0535   0.0676  0.0657     0.0025   8e-04  0.0154
## 
## sigma^2 = 0.00206:  log likelihood = 2259.25
## AIC=-4504.5   AICc=-4504.41   BIC=-4468.05
## 
## Training set error measures:
##                       ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.001340997 0.04527438 0.03478043 NaN  Inf 0.7847427 -0.008252592
arima_covariates_noGenet <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 2), xreg = xreg_NoGenet)
summary(arima_covariates_noGenet)
## Series: all_size_na$avg_size_diff 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##          ar1      ma1     ma2  treatment     mhw
##       0.7661  -1.5293  0.5351    -0.0029  0.0012
## s.e.  0.0539   0.0682  0.0662     0.0025  0.0155
## 
## sigma^2 = 0.002059:  log likelihood = 2259.17
## AIC=-4506.35   AICc=-4506.28   BIC=-4475.1
## 
## Training set error measures:
##                       ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.001324029 0.04527702 0.03477904 NaN  Inf 0.7847113 -0.008522662
arima_covariates_mhw <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 2), xreg = xreg_MHW)
summary(arima_covariates_mhw)
## Series: all_size_na$avg_size_diff 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##          ar1      ma1     ma2     mhw
##       0.7750  -1.5384  0.5437  0.0039
## s.e.  0.0512   0.0657  0.0641  0.0149
## 
## sigma^2 = 0.002059:  log likelihood = 2258.54
## AIC=-4507.08   AICc=-4507.04   BIC=-4481.05
## 
## Training set error measures:
##                       ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.001260464 0.04529743 0.03478482 Inf  Inf 0.7848417 -0.007205159
arima_covariates_treatment <- Arima(all_size_na$avg_size_diff, order = c(1, 1, 2), xreg = xreg_Treatment)
summary(arima_covariates_treatment)
## Series: all_size_na$avg_size_diff 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##          ar1      ma1     ma2  treatment
##       0.7651  -1.5282  0.5341    -0.0029
## s.e.  0.0520   0.0663  0.0646     0.0025
## 
## sigma^2 = 0.002058:  log likelihood = 2259.17
## AIC=-4508.34   AICc=-4508.3   BIC=-4482.3
## 
## Training set error measures:
##                      ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.00134849 0.04527726 0.03478006 NaN  Inf 0.7847344 -0.008539237

Looks like genet is not a significant covariate, but treatment and during/after MHW are slightly impactful. <<< DISCUSS WITH CHRIS.

Auto-ARIMA with Covariates model (check AR, I, MA)

xreg_Covariate <- model.matrix(~ genet + mhw - 1, data = all_size_na)

autoArima_Covariate <- auto.arima(all_size_na$avg_size_diff, seasonal = TRUE, stepwise = FALSE, approximation = FALSE, xreg = xreg_Covariate)
summary(autoArima_Covariate)
## Series: all_size_na$avg_size_diff 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##          ar1      ma1     ma2   genet     mhw
##       0.7673  -1.5283  0.5340  -3e-04  0.0016
## s.e.  0.0532   0.0675  0.0657   8e-04  0.0153
## 
## sigma^2 = 0.002061:  log likelihood = 2258.6
## AIC=-4505.2   AICc=-4505.14   BIC=-4473.96
## 
## Training set error measures:
##                       ME       RMSE        MAE MPE MAPE      MASE         ACF1
## Training set 0.001274559 0.04529606 0.03479419 Inf  Inf 0.7850531 -0.009482209

No change to AR, I, or MA with any of the potential covariates.

So, for first-differenced data, here are the two model options #### Basic ARIMA(1,1,2): Coefficients: ar1 ma1 ma2 0.7682 -1.5296 0.5354 s.e. 0.0518 0.0662 0.0646

sigma^2 = 0.002058: log likelihood = 2258.51 AIC=-4509.02 AICc=-4508.99 BIC=-4488.19

                  ME       RMSE        MAE MPE MAPE      MASE         ACF1

Training set 0.001281871 0.04529954 0.03479757 Inf Inf 0.7851294 -0.009148644

ARIMA(1,1,2) with MHW/Treatment covariates: Coefficients: ar1 ma1 ma2 treatment mhw 0.7661 -1.5293 0.5351 -0.0029 0.0012 s.e. 0.0539 0.0682 0.0662 0.0025 0.0155

sigma^2 = 0.002059: log likelihood = 2259.17 AIC=-4506.35 AICc=-4506.28 BIC=-4475.1

                  ME       RMSE        MAE MPE MAPE      MASE         ACF1

Training set 0.001324029 0.04527702 0.03477904 NaN Inf 0.7847113 -0.008522662 ####

Check model residuals for MHW/Treatment ARIMA

residuals_auto <- residuals(arima_covariates_noGenet)
plot(residuals_auto, main = "Residuals of AUTOSARIMA Model")

Acf(residuals_auto, main = "ACF of Residuals")

Box.test(residuals_auto, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals_auto
## X-squared = 0.098276, df = 1, p-value = 0.7539
adf_test_residuals <- adf.test(residuals_auto, alternative = "stationary") #definitely stationary
qqnorm(residuals_auto)
qqline(residuals_auto)

shapiro.test(residuals_auto)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals_auto
## W = 0.99067, p-value = 1.463e-07
ggplot() +
  geom_histogram(aes(x = residuals_auto), bins = 20, color = "black") +
  labs(x = "Residuals",
       y = "Frequency")

fitted_values <- fitted(arima_covariates_noGenet)
plot(fitted_values, residuals_auto, main="Residuals vs Fitted Values")
abline(h=0, col="red")

Now, let’s check out ARIMA models on second-differenced data (since it is technically more stationary)

autoArima2 <- auto.arima(all_size_na$avg_size_diff2, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(autoArima2)
## Series: all_size_na$avg_size_diff2 
## ARIMA(1,0,4) with zero mean 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4
##       0.9289  -0.7029  0.0217  -0.0357  -0.1054
## s.e.  0.0412   0.0506  0.0356   0.0353   0.0303
## 
## sigma^2 = 0.005237:  log likelihood = 1541.35
## AIC=-3070.71   AICc=-3070.64   BIC=-3039.8
## 
## Training set error measures:
##                        ME      RMSE        MAE       MPE    MAPE      MASE
## Training set 0.0007083893 0.0722271 0.05541204 -20.12162 319.208 0.7668489
##                     ACF1
## Training set 0.000179781

AR(1) model: This is autoregressive data, and the current value of the time series depends linearly on its immediately preceding value - makes sense because body size shouldn’t change that drastically week by week. I(0): No differencing applied to make the series stationary - which makes sense because the body size data has been differenced twice which we know makes it stationary. MA(4): current value of the time series depends on the last FOUR periods’ forecast errors - means that the current value of the series depends on the errors made in the previous four periods. Greater value than first-differenced data model MA(2). Meaning??

Check model residuals

residuals_auto2 <- residuals(autoArima2)
plot(residuals_auto2, main = "Residuals of AUTOSARIMA Model")

Acf(residuals_auto2, main = "ACF of Residuals")

Box.test(residuals_auto2, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals_auto2
## X-squared = 4.1307e-05, df = 1, p-value = 0.9949
adf_test_residuals2 <- adf.test(residuals_auto2, alternative = "stationary") #definitely stationary
qqnorm(residuals_auto2)
qqline(residuals_auto2)

shapiro.test(residuals_auto2)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals_auto2
## W = 0.99254, p-value = 4.801e-06
ggplot() +
  geom_histogram(aes(x = residuals_auto2), bins = 20, color = "black") +
  labs(x = "Residuals",
       y = "Frequency")

fitted_values2 <- fitted(autoArima2)
plot(fitted_values2, residuals_auto2, main="Residuals vs Fitted Values")
abline(h=0, col="red")

Shapiro-Wilks test says residuals are close to normal distribution but not normally distributed… but the plots look fine.

Consider potential covariates

#create matrix of dummy variables incorporating all potential covariates
xreg_All <- model.matrix(~ treatment + genet + mhw - 1, data = all_size_na)
xreg_NoGenet <- model.matrix(~ treatment + mhw - 1, data = all_size_na)
xreg_NoTreatment <- model.matrix(~ genet + mhw - 1, data = all_size_na)
xreg_NoMHW <- model.matrix(~ treatment + genet - 1, data = all_size_na)
xreg_MHW <- model.matrix(~ mhw - 1, data = all_size_na)
xreg_Treatment <- model.matrix(~ treatment - 1, data = all_size_na)
xreg_Genet <- model.matrix(~ genet - 1, data = all_size_na)

arima_no_covariates2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4))
summary(arima_no_covariates2)
## Series: all_size_na$avg_size_diff2 
## ARIMA(1,0,4) with non-zero mean 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4    mean
##       0.9271  -0.7011  0.0222  -0.0353  -0.1049  0.0018
## s.e.  0.0425   0.0517  0.0356   0.0353   0.0304  0.0050
## 
## sigma^2 = 0.005241:  log likelihood = 1541.42
## AIC=-3068.84   AICc=-3068.75   BIC=-3032.79
## 
## Training set error measures:
##                         ME       RMSE        MAE      MPE     MAPE      MASE
## Training set -3.024331e-05 0.07222344 0.05540808 167.9321 253.6381 0.7754863
##                      ACF1
## Training set 0.0002677154
arima_covariates_all2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_All)
summary(arima_covariates_all2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept  treatment   genet
##       0.9284  -0.7025  0.0218  -0.0357  -0.1054     0.0022    -0.0006  0.0001
## s.e.  0.0416   0.0509  0.0356   0.0353   0.0304     0.0185     0.0042  0.0012
##          mhw
##       0.0003
## s.e.  0.0099
## 
## sigma^2 = 0.005253:  log likelihood = 1541.44
## AIC=-3062.88   AICc=-3062.7   BIC=-3011.37
## 
## Training set error measures:
##                         ME       RMSE       MAE      MPE     MAPE      MASE
## Training set -2.928559e-05 0.07222242 0.0554114 167.5572 253.8037 0.7755327
##                      ACF1
## Training set 0.0002797304
arima_covariates_noGenet2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_NoGenet)
summary(arima_covariates_noGenet2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept  treatment     mhw
##       0.9296  -0.7038  0.0216  -0.0361  -0.1057     0.0025    -0.0006  0.0004
## s.e.  0.0407   0.0501  0.0355   0.0352   0.0303     0.0182     0.0042  0.0099
## 
## sigma^2 = 0.005249:  log likelihood = 1541.43
## AIC=-3064.86   AICc=-3064.72   BIC=-3018.5
## 
## Training set error measures:
##                         ME       RMSE        MAE      MPE     MAPE      MASE
## Training set -2.762402e-05 0.07222283 0.05540972 166.8089 254.3437 0.7755092
##                      ACF1
## Training set 0.0003822157
arima_covariates_noTreatment2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_NoTreatment)
summary(arima_covariates_noTreatment2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept   genet     mhw
##       0.9266  -0.7006  0.0223  -0.0352  -0.1047     0.0011  0.0001  0.0003
## s.e.  0.0428   0.0520  0.0356   0.0353   0.0305     0.0163  0.0012  0.0098
## 
## sigma^2 = 0.005249:  log likelihood = 1541.43
## AIC=-3064.85   AICc=-3064.71   BIC=-3018.5
## 
## Training set error measures:
##                         ME       RMSE        MAE      MPE     MAPE      MASE
## Training set -6.590817e-05 0.07222308 0.05541425 168.3794 253.2696 0.7755727
##                    ACF1
## Training set 0.00023234
arima_covariates_noMHW2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_NoMHW)
summary(arima_covariates_noMHW2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept  treatment   genet
##       0.9276  -0.7017  0.0220  -0.0354  -0.1051     0.0025    -0.0006  0.0001
## s.e.  0.0423   0.0515  0.0356   0.0353   0.0304     0.0104     0.0042  0.0012
## 
## sigma^2 = 0.005249:  log likelihood = 1541.44
## AIC=-3064.87   AICc=-3064.73   BIC=-3018.52
## 
## Training set error measures:
##                        ME       RMSE        MAE      MPE     MAPE      MASE
## Training set 2.477606e-05 0.07222246 0.05540808 167.4926 253.5783 0.7754863
##                      ACF1
## Training set 0.0002554762
arima_covariates_Genet2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_Genet)
summary(arima_covariates_Genet2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept   genet
##       0.9272  -0.7013  0.0221  -0.0353  -0.1050     0.0014  0.0001
## s.e.  0.0424   0.0516  0.0356   0.0353   0.0304     0.0062  0.0012
## 
## sigma^2 = 0.005245:  log likelihood = 1541.43
## AIC=-3066.85   AICc=-3066.74   BIC=-3025.65
## 
## Training set error measures:
##                         ME       RMSE        MAE      MPE     MAPE      MASE
## Training set -3.658259e-05 0.07222305 0.05541292 168.5765 253.4582 0.7755541
##                      ACF1
## Training set 0.0002772473
arima_covariates_mhw2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_MHW)
summary(arima_covariates_mhw2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept     mhw
##       0.9276  -0.7017  0.0220  -0.0355  -0.1051     0.0014  0.0002
## s.e.  0.0420   0.0513  0.0356   0.0353   0.0304     0.0159  0.0099
## 
## sigma^2 = 0.005245:  log likelihood = 1541.42
## AIC=-3066.84   AICc=-3066.73   BIC=-3025.64
## 
## Training set error measures:
##                        ME       RMSE        MAE      MPE     MAPE      MASE
## Training set 1.976376e-05 0.07222341 0.05540994 167.5124 253.7161 0.7755123
##                      ACF1
## Training set 0.0002668085
arima_covariates_treatment2 <- Arima(all_size_na$avg_size_diff2, order = c(1, 0, 4), xreg = xreg_Treatment)
summary(arima_covariates_treatment2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  intercept  treatment
##       0.9287  -0.7029  0.0217  -0.0357  -0.1055     0.0031    -0.0006
## s.e.  0.0414   0.0507  0.0356   0.0353   0.0303     0.0098     0.0042
## 
## sigma^2 = 0.005245:  log likelihood = 1541.43
## AIC=-3066.86   AICc=-3066.75   BIC=-3025.66
## 
## Training set error measures:
##                         ME       RMSE        MAE      MPE     MAPE      MASE
## Training set -1.102096e-05 0.07222282 0.05540557 167.0637 254.1057 0.7754512
##                      ACF1
## Training set 0.0003115088

TreatmentxGenetxMHW model performs the worst. TreatmentxMHW, GenetxMHW, TreatmentxGenet perform second worst. No covariates performs best, Treatment alone, Genet alone, and MHW alone all perform second best. This probably means we should use no covariates in second-differenced data model?

Auto-ARIMA with Covariates model (check AR, I, MA)

xreg_Covariate2 <- model.matrix(~ treatment + genet + mhw - 1, data = all_size_na)

autoArima_Covariate2 <- auto.arima(all_size_na$avg_size_diff2, seasonal = TRUE, stepwise = FALSE, approximation = FALSE, xreg = xreg_Covariate2)
summary(autoArima_Covariate2)
## Series: all_size_na$avg_size_diff2 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3      ma4  treatment   genet     mhw
##       0.9280  -0.7021  0.0219  -0.0356  -0.1052    -0.0004  0.0002  0.0013
## s.e.  0.0419   0.0511  0.0356   0.0353   0.0304     0.0037  0.0012  0.0056
## 
## sigma^2 = 0.005249:  log likelihood = 1541.43
## AIC=-3064.86   AICc=-3064.72   BIC=-3018.5
## 
## Training set error measures:
##                        ME       RMSE        MAE       MPE     MAPE      MASE
## Training set 1.247646e-05 0.07222285 0.05542077 -18.00826 321.5899 0.7669697
##                      ACF1
## Training set 0.0002889995

Same as first-differenced data, no change to AR, I, or MA with any of the potential covariates.

For second-differenced data, here is the apparent best model option #### Series: all_size_na$avg_size_diff2 ARIMA(1,0,4) with zero mean Coefficients: ar1 ma1 ma2 ma3 ma4 0.9289 -0.7029 0.0217 -0.0357 -0.1054 s.e. 0.0412 0.0506 0.0356 0.0353 0.0303

sigma^2 = 0.005237: log likelihood = 1541.35 AIC=-3070.71 AICc=-3070.64 BIC=-3039.8

Training set error measures: ME RMSE MAE MPE MAPE MASE Training set 0.0007083893 0.0722271 0.05541204 -20.12162 319.208 0.7668489 ACF1 Training set 0.000179781 ####

Plot differenced data with O.G. values

Genet only

# Create the plots
plot_undifferenced <- ggplot(all_size_na, aes(x = date, y = avg_size_log, group=as.factor(genet))) +
  geom_point(aes(color=as.factor(genet)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(genet)), method="lm") +
  ggtitle("Undifferenced Data") +
  xlab("Date") +
  ylab("Log Body size (mm)") +
  theme_bw()

plot_differenced <- ggplot(all_size_na, aes(x = date, y = avg_size_diff, group=as.factor(genet))) +
  geom_point(aes(color=as.factor(genet)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(genet)), method="lm") +
  ggtitle("Differenced Data") +
  xlab("Date") +
  ylab("Differenced Body Size (mm)") +
  theme_bw()

plot_differenced2 <- ggplot(all_size_na, aes(x = date, y = avg_size_diff2, group=as.factor(genet))) +
  geom_point(aes(color=as.factor(genet)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(genet)), method="lm") +
  ggtitle("2-Differenced Data") +
  xlab("Date") +
  ylab("2-Differenced Body Size (mm)") +
  theme_bw()

combined_plot <- grid.arrange(plot_undifferenced, plot_differenced, plot_differenced2, ncol = 3)

#ggsave(here("experiment", "figures", "differenced_comparison_genet.png"), combined_plot, width=14, height=7)

Well damn, that makes sense why genet is not a significant covariate in the differenced data models!

Treatment only

# Create the plots
plot_undifferenced <- ggplot(all_size_na, aes(x = date, y = avg_size_log, group=as.factor(treatment))) +
  geom_point(aes(color=as.factor(treatment)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(treatment)), method="lm") +
  ggtitle("Undifferenced Data") +
  xlab("Date") +
  ylab("Log Body size (mm)") +
  theme_bw()

plot_differenced <- ggplot(all_size_na, aes(x = date, y = avg_size_diff, group=as.factor(treatment))) +
  geom_point(aes(color=as.factor(treatment)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(treatment)), method="lm") +
  ggtitle("Differenced Data") +
  xlab("Date") +
  ylab("Differenced Body Size (mm)") +
  theme_bw()

plot_differenced2 <- ggplot(all_size_na, aes(x = date, y = avg_size_diff2, group=as.factor(treatment))) +
  geom_point(aes(color=as.factor(treatment)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(treatment)), method="lm") +
  ggtitle("2-Differenced Data") +
  xlab("Date") +
  ylab("2-Differenced Body Size (mm)") +
  theme_bw()

combined_plot <- grid.arrange(plot_undifferenced, plot_differenced, plot_differenced2, ncol = 3)

#ggsave(here("experiment", "figures", "differenced_comparison_treatment.png"), combined_plot, width=14, height=7)

No difference in linear slope between three treatments

MHW only

# Create the plots
plot_undifferenced <- ggplot(all_size_na, aes(x = date, y = avg_size_log, group=as.factor(mhw))) +
  geom_point(aes(color=as.factor(mhw)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(mhw)), method="lm") +
  ggtitle("Undifferenced Data") +
  xlab("Date") +
  ylab("Log Body size (mm)") +
  theme_bw()

plot_differenced <- ggplot(all_size_na, aes(x = date, y = avg_size_diff, group=as.factor(mhw))) +
  geom_point(aes(color=as.factor(mhw)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(mhw)), method="lm") +
  ggtitle("Differenced Data") +
  xlab("Date") +
  ylab("Differenced Body Size (mm)") +
  theme_bw()

plot_differenced2 <- ggplot(all_size_na, aes(x = date, y = avg_size_diff2, group=as.factor(mhw))) +
  geom_point(aes(color=as.factor(mhw)), alpha=0.3) +
  geom_smooth(aes(color=as.factor(mhw)), method="lm") +
  ggtitle("2-Differenced Data") +
  xlab("Date") +
  ylab("2-Differenced Body Size (mm)") +
  theme_bw()

combined_plot <- grid.arrange(plot_undifferenced, plot_differenced, plot_differenced2, ncol = 3)

#ggsave(here("experiment", "figures", "differenced_comparison_mhw.png"), combined_plot, width=14, height=7)

MHW slope is definitely different in undifferenced data.

So… MHW and Genet do appear different in the raw body size data. But that doesn’t show up in the Differenced data… how do I reconcile this for the ARIMA? ###

Use basic first-differenced ARIMA to forecast.

future_predictions <- forecast(autoArima, h = 50)  # Forecast h periods ahead

plot(future_predictions)

Just for conceptual understanding, create a quick autoARIMA model on undifferenced data and plot forecast

autoArima_undiff <- auto.arima(all_size_na$avg_size_log, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(autoArima_undiff)
## Series: all_size_na$avg_size_log 
## ARIMA(5,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5
##       -0.8009  -0.7477  -0.8069  -0.5740  -0.0480
## s.e.   0.0272   0.0312   0.0301   0.0312   0.0273
## 
## sigma^2 = 0.008836:  log likelihood = 1276.73
## AIC=-2541.47   AICc=-2541.4   BIC=-2510.22
## 
## Training set error measures:
##                        ME       RMSE        MAE        MPE     MAPE      MASE
## Training set -0.001503275 0.09379348 0.07551933 -0.4226821 4.927029 0.6617219
##                     ACF1
## Training set 0.001854019
future_predictions <- forecast(autoArima_undiff, h = 50)  # Forecast h periods ahead
plot(future_predictions)